<?php
include 'db.php';

// One-off maintenance script: pages through `word_v2` in id order and rewrites
// every `word` whose text changes when passed through clean().
// Uses the MySQL connection that db.php is expected to open (TODO confirm).

$start = 0;      // row offset of the batch currently being processed
$batch = 1000;   // number of rows fetched per SELECT
$affect = 0;     // rows whose `word` was actually rewritten

// Log for the optional drop passes kept (disabled) inside the loop below.
// 'w+' is the documented truncate + read/write mode; the previous 'wr+' is
// not a valid fopen() mode string.
$lfp = fopen('log_drop_stop.txt', 'w+');

while (true){
	$result = mysql_query("select * from `word_v2` order by `id` asc limit $start,$batch");
	if (!$result){
		// Report the failure instead of ending the run silently.
		echo 'query failed: ', mysql_error(), PHP_EOL;
		break;
	}
	// A SELECT past the end of the table still returns a (truthy) result
	// resource, so an empty batch is the only reliable end-of-table signal.
	if (mysql_num_rows($result) === 0){
		mysql_free_result($result);
		break;
	}
	while ($row = mysql_fetch_assoc($result)){
		$word = $row['word'];
		$clean_word = clean($word);

		// Only touch rows that clean() actually changed, so the counter and
		// the "changed!" output reflect real modifications.
		if ($clean_word !== $word){
			// Escape both values: a quote or backslash in the word would
			// otherwise break (or alter) the UPDATE statement.
			$sql_word = mysql_real_escape_string($clean_word);
			$sql_id = mysql_real_escape_string($row['id']);
			if (mysql_query("update `word_v2` set `word`='{$sql_word}' where `id`='{$sql_id}'")){
				$affect++;
				echo $clean_word;
				echo $row['id'], "\t\tchanged!", PHP_EOL;
			}
		}

		// Disabled pass: delete words that contain a stop word and log them.
		/*
		if (hasStop($word)){
			mysql_query("delete from `word_v2` where `id`='{$row['id']}'");
			$str = $row['id'] . "\t\tinclude stop word,droped." . PHP_EOL;
			fputs($lfp, $str, strlen($str));
			echo $str;
		}
		*/
		// Disabled pass: report words that contain ASCII letters or digits.
		/*
		if(hasAlpha($word)){
			//mysql_query("delete from `word_v2` where `id`='{$row['id']}'");
			$str = $row['id'] . "\t\tinclude alpha or number,droped." . PHP_EOL;
			//fputs($lfp, $str, strlen($str));
			echo $str;
		}*/
	}
	mysql_free_result($result);

	// Advance the offset once per batch (not once per row) so the next
	// SELECT continues exactly where this one stopped.
	$start += $batch;
	echo $start, 'completed', PHP_EOL;
}

echo $affect, ' rows updated', PHP_EOL;

if ($lfp){
	fclose($lfp);
}

/**
 * Remove punctuation marks from a word.
 *
 * Every occurrence of each character in the list below is deleted; all
 * other characters (including ASCII punctuation) are left untouched.
 *
 * @param string $word text to clean
 * @return string the text with the listed punctuation characters removed
 */
function clean($word){
	$marks = array('－','＝','［','］','、','‘','；','／','。','，','｜','？','》','《','：','“','｛','｝','＋','—','）','（','＊','…','％','￥','＃','·','！','～','’','”','〕','〈','〉','「','」','『','』','〖','〗','【','】','＜','＞');
	// str_replace() takes an array of needles and strips them one after another.
	return str_replace($marks, '', $word);
}

/**
 * Tell whether a word contains any stop word.
 *
 * The stop words are the single characters listed below; the search is
 * done with mb_strpos() using the utf-8 encoding.
 *
 * @param string $word text to inspect
 * @return bool true when at least one stop word occurs in $word
 */
function hasStop($word){
	$stopWords = array('给','的','说','对','在','和','是','被','最','所','那','这','有','将','你','会','与','他','为','不','没','很','了','啊','哦','呵','把','去');
	$found = false;
	// Stop scanning as soon as one stop word has been located.
	for ($i = 0, $total = count($stopWords); $i < $total && !$found; $i++){
		$found = mb_strpos($word, $stopWords[$i], 0, 'utf-8') !== false;
	}
	return $found;
}

/**
 * Check whether a word contains an ASCII letter, digit, underscore or hyphen.
 *
 * @param string $word text to inspect
 * @return int|false preg_match() result: 1 on a match, 0 otherwise, false on error
 */
function hasAlpha($word){
	$pattern = "/[a-zA-Z0-9_-]/";
	$matched = preg_match($pattern, $word);
	return $matched;
}